import re
import time
import json
import requests
from bs4 import BeautifulSoup
from pypinyin import lazy_pinyin

# Shared accumulators filled by the getinfo_* scrapers below.
error = 0        # total number of failed page requests across all sites
company = []     # company names collected from every site
herf = []        # job-posting URLs, parallel to `company` (name kept for compatibility; typo for "href")
sorted_str = []  # pinyin-sorted, de-duplicated markdown links (set by sort_save, narrowed by exclude)

# Industry/keyword blocklist: an entry containing any of these substrings
# is dropped by exclude(). Duplicate entries removed ('机电' was listed twice).
bans_key = [
    '恒大', '教育', '银行', '单位', '机械', '研究所', '集团',
    '小组', '政府', '实验室', '投资', '房地产', '光学',
    '医疗', '科学院', '金融', '人力资源', '工厂', '中心',
    '大学', '学院', '中学', '烟草', '组织', '传媒', '学校',
    '物业', '机电', '保险', '证券', '管理', '研究院', '销售',
    '事务所', '建筑', '石油', '物流', '航空', '房产', '化工',
    '石化', '环境', '农业', '出口', '制冷', '知识产权', '食品',
    '仪器', '材料', '海洋', '生物', '贸易', '商贸', '地产',
    '照明', '培训', '电器', '碧桂园', '服务', '日化', '青岛',
    '酒店', '能源', '媒体', '办事处', '设计院', '控股', '国投',
    '咨询', '啤酒', '陶瓷', '福建', '商务', '建设', '创意', '饮料',
    '电商', '中共', '工程', '招聘会', '双选会', '药', '核电', '电梯',
    '部队', '文具', '涂料', '塑胶', '信用社', '置业', '营销', '策划',
    '旅游', '旅行', '乳', '中建', '局', '珠宝', '租赁', '烟台', '铝',
    '钢', '期货', '客车', '五金', '塑', '金属', '家居', '医院',
    '空调', '家具', '超市', '生态', '时装', '医', '衣', '电力', '电气',
    '娱乐', '鞋', '联合国', '办公室', '化学', '环保', '体育', '评估',
    '水利', '文化', '化妆', '橡胶', '轴承', '公安', '酒', '农', '实业',
    '工业', '服饰', '茶', '餐饮', '光电', '商业'
]

# City/region blocklist, also applied by exclude(). The original list held
# many duplicated entries; removing them does not change filtering behavior
# (exclude() de-duplicates the combined list anyway).
ban_city = [
    '北京', '上海', '昆明', '株洲', '东莞', '内蒙古',
    '郑州', '南京', '武汉', '长沙', '南宁',
    '广西', '宁夏', '新疆', '河北', '山西', '辽宁', '吉林', '黑龙江',
    '安徽', '福建', '江西', '山东', '河南', '湖北', '湖南',
    '广东', '海南', '云南', '陕西', '甘肃', '青海',
    '宜宾', '义乌', '桂林', '沈阳', '莆田',
    '乌鲁木齐', '佛山', '资阳',
    '中山', '福州', '赣州', '合肥', '惠州', '湖州', '嘉兴', '济南',
    '宁波', '绍兴', '石家庄', '苏州', '天津', '无锡', '芜湖', '厦门',
]

# NOTE(review): the two lists below are not referenced anywhere in this
# file — presumably used by another module or a planned filter; confirm
# before removing.
must_have_keyword = [
    '2020',
    '2021'
]

search_content_keyword = [
    'linux',
    '嵌入式',
    'arm',
    'rtos',
    'uboot',
    'kernel',
    'python',
    'fpga',
    '电子信息',
    'c[+][+]'
]

# 使用前改 cookies 页数


def getinfo_chengdian():
    """Scrape recruiting listings from UESTC (成电) job site.

    Appends company names/URLs to the global ``company``/``herf`` lists
    and writes them to ``markdown/jobs_成电.md``. Failed pages increment
    the global ``error`` counter and are skipped.

    NOTE: requires a valid PHPSESSID cookie and an up-to-date page count
    (see comment above the function).
    """
    names = []
    urls = []
    global company
    global herf
    global error

    mystart = time.time()
    print('开始获取成电招聘信息...')

    # url/headers are identical for every page — build them once.
    url = 'https://jiuye.uestc.edu.cn/sys/fore.php?op=listRecruit&callback=jQuery183013575194361081655_1636513006101'
    headers = {
        'Host': 'jiuye.uestc.edu.cn',
        'Connection': 'keep-alive',
        'Content-Length': '16',
        'sec-ch-ua': '"Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'sec-ch-ua-platform': '"Windows"',
        'Origin': 'https://jiuye.uestc.edu.cn',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://jiuye.uestc.edu.cn/career/info/otherRec.html',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
        'Cookie': 'PHPSESSID=p4lulu8fk21eo05cfoevfbf7n4'
    }

    for page in range(1, 400):
        data = {
            'rec_way': '2',
            'page': str(page)
        }

        try:
            # timeout prevents a dead connection from hanging the whole run
            r = requests.post(url, headers=headers, data=data, timeout=15)
            r.encoding = 'utf-8'
            # Response is JSONP: strip the jQuery callback wrapper,
            # then parse the JSON body inside the parentheses.
            payload = json.loads(re.findall(r'[(](.*)[)]', r.text)[0])
            for item in payload['data']:
                names.append(item['rec_enter_name'])
                urls.append(
                    'https://jiuye.uestc.edu.cn/career/info/Recruitment.html?id='
                    + str(item['rec_No']) + '&rectype=1')
        except Exception:
            error += 1
            print('成电: error happen in page', page, 'error', error)

        print('成电: page success', page)

    with open('markdown/jobs_成电.md', 'w', encoding='utf-8') as f:
        print('成电: 保存公司到 markdown/jobs_成电.md ...')
        for name, link in zip(names, urls):
            f.write('[{}]({})\r\n'.format(name, link))
        print('成电: 保存完毕')

    company += names
    herf += urls
    print('成电: 公司数量:{} 时间:{}s'.format(
        len(names), round(time.time() - mystart, 2)))
    time.sleep(2)

# 使用前改count cookies 页数


def getinfo_hangdian():
    """Scrape recruiting listings from HDU (杭电) job site.

    Appends company names/URLs to the global ``company``/``herf`` lists
    and writes them to ``markdown/jobs_杭电.md``. Failed pages increment
    the global ``error`` counter and are skipped.

    NOTE: requires up-to-date ``count``, cookies, and page range
    (see comment above the function).
    """
    names = []
    urls = []
    global company
    global herf
    global error

    # Fake millisecond timestamp used as the `_` cache-buster query param.
    count = 1614498384032

    mystart = time.time()
    print('getinfo_hangdian： 开始获取杭电招聘信息...')

    # Headers are identical for every page — build them once.
    headers = {
        'Host': 'career.hdu.edu.cn',
        'Connection': 'keep-alive',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'http://career.hdu.edu.cn/module/onlines',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
        'Cookie': 'WEYON=d64b12c5740836b0c94ddf150b5c3b3b; Hm_lvt_9d62b12bee08da154fa2a80d8ee90355=1636513619; Hm_lpvt_9d62b12bee08da154fa2a80d8ee90355=1636513670'
    }

    for page in range(1, 440):
        count += 1
        url = ('http://career.hdu.edu.cn/module/getonlines?start_page=1&k=&recruit_type=&count=15&start='
               + str(page) + '&_=' + str(count))

        try:
            # timeout prevents a dead connection from hanging the whole run
            r = requests.get(url, headers=headers, timeout=15)
            r.encoding = 'utf-8'
            payload = json.loads(r.text)
            for item in payload['data']:
                names.append(item['company_name'])
                urls.append('http://career.hdu.edu.cn/detail/online?id=' +
                            item['recruitment_id'] + '&menu_id=')
        except Exception:
            error += 1
            print('getinfo_hangdian: error happen in page', page, 'error', error)

        print('杭电: page success', page)

    with open('markdown/jobs_杭电.md', 'w', encoding='utf-8') as f:
        print('杭电: 保存公司到 markdown/jobs_杭电.md ...')
        for name, link in zip(names, urls):
            f.write('[{}]({})\r\n'.format(name, link))
        print('杭电: 保存完毕')

    company += names
    herf += urls

    print('杭电: 公司数量:{} 时间:{}s'.format(
        len(names), round(time.time() - mystart, 2)))
    time.sleep(2)

# 使用前改cookies 页数


def getinfo_haida():
    """Scrape recruiting listings from Hainan University (海大) job site.

    Parses the HTML listing pages with BeautifulSoup, appends company
    names/URLs to the global ``company``/``herf`` lists, and writes them
    to ``markdown/jobs_海大.md``. Failed pages increment the global
    ``error`` counter and are skipped.

    NOTE: requires valid cookies and an up-to-date page range
    (see comment above the function).
    """
    names = []
    urls = []
    global company
    global herf
    global error

    mystart = time.time()
    print('getinfo_haida： 开始获取海大招聘信息...')

    headers = {
        'Host': 'jyxt.hainanu.edu.cn',
        'Connection': 'keep-alive',
        'sec-ch-ua': '"Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Referer': 'https://jyxt.hainanu.edu.cn/home/index/emplory/mcit/MDAwMDAwMDAwMJG6n3_Ed6imi4qQtMOLl9WQqcyvyGLMrLqgu86Bn4xmirmjqsKIjteZeoywxJ10og.html',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
        'Cookie': 'UM_distinctid=17d07b93132427-0d93cfc2ea4f92-57b1a33-1fa400-17d07b931337fb; PHPSESSID=ST-24177-XQWQnfCg7V7fPxEPd1AE1636525281379-CxFf-cas'
    }

    for page in range(1, 181):
        url = 'https://jyxt.hainanu.edu.cn/Index/emplory/mcit/MDAwMDAwMDAwMJG6n3_Ed6imi4qQsMOen9WKqLKvsoizpsp5u9GGrZ6ujs6WZMKGcdKLoJDStXqn1ZDPqaE/page/' + \
            str(page)+'.html'

        try:
            # timeout prevents a dead connection from hanging the whole run
            r = requests.get(url, headers=headers, timeout=15)
            r.encoding = 'utf-8'
            soup = BeautifulSoup(r.text, 'html.parser')
            anchors = soup.select('ul.list-unstyled a')

            for anchor in anchors:
                # Link text looks like "公司名\xa0-\xa0..." — keep the name part.
                names.append(anchor.string.split('\xa0-\xa0')[0])
                urls.append('https://jyxt.hainanu.edu.cn' + anchor['href'])
        except Exception:
            error += 1
            print('海大: error happen in page', page, 'error', error)

        print('海大: page success', page)

    with open('markdown/jobs_海大.md', 'w', encoding='utf-8') as f:
        print('海大: 保存公司到 markdown/jobs_海大.md ...')
        for name, link in zip(names, urls):
            f.write('[{}]({})\r\n'.format(name, link))
        print('海大: 保存完毕')

    company += names
    herf += urls

    print('海大: 公司数量:{} 时间:{}s'.format(
        len(names), round(time.time() - mystart, 2)))
    time.sleep(2)


def getinfo_chengxin():
    """Scrape recruiting listings from CUIT (成信) via the jiuyeb.cn API.

    Appends company names/URLs to the global ``company``/``herf`` lists
    and writes them to ``markdown/jobs_成信.md``. Failed pages increment
    the global ``error`` counter and are skipped.
    """
    names = []
    urls = []
    global company
    global herf
    global error

    mystart = time.time()
    # fixed: previously logged the wrong function name (getinfo_haida)
    print('getinfo_chengxin： 开始获取成信招聘信息...')

    headers = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7',
        'auth': 'Baisc MTAyNDY6MTAyNDY=',
        'content-length': '173',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://jy.cuit.edu.cn',
        'referer': 'https://jy.cuit.edu.cn/',
        'sec-ch-ua': '"Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'cross-site',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }

    # Endpoint is the same for every page; only the `page` field changes.
    url = 'https://a.jiuyeb.cn/mobile.php/enrollment/getlist'

    for page in range(1, 109):
        data = {
            'school_id': 'f71533c1-f48f-6b55-36af-e9e833d3794d',
            'page': str(page),
            'size': '20',
            'login_user_id': '1',
            'login_admin_school_code': '10621',
            'login_admin_school_id': 'f71533c1-f48f-6b55-36af-e9e833d3794d'
        }

        try:
            # timeout prevents a dead connection from hanging the whole run
            r = requests.post(url, headers=headers, data=data, timeout=15)
            r.encoding = 'utf-8'
            payload = json.loads(r.text)
            for item in payload['data']['list']:
                names.append(item['remarks'])
                urls.append(
                    'https://jy.cuit.edu.cn/Zhaopin/xiaozhao.html?id=' + str(item['id']) + '&type=1')
        except Exception:
            error += 1
            print('成信: error happen in page', page, 'error', error)

        print('成信: page success', page)

    with open('markdown/jobs_成信.md', 'w', encoding='utf-8') as f:
        print('成信: 保存公司到 markdown/jobs_成信.md ...')
        for name, link in zip(names, urls):
            f.write('[{}]({})\r\n'.format(name, link))
        print('成信: 保存完毕')

    company += names
    herf += urls
    print('成信: 公司数量:{} 时间:{}s'.format(
        len(names), round(time.time() - mystart, 2)))
    time.sleep(2)

# 排序去重保存


def sort_save():
    """De-duplicate the collected companies, sort them by pinyin, and
    save the resulting markdown links to ``markdown/all.md``.

    Reads the global ``company``/``herf`` lists; stores the sorted list
    in the global ``sorted_str`` for exclude() to consume.
    """
    mystart = time.time()
    print('sort_save: 开始排序去重保存')
    global company
    global herf
    global sorted_str

    # De-duplicate by company name, keeping the first URL seen.
    # A set gives O(1) membership tests (the old list scan was O(n^2)).
    seen = set()
    hanzi_list = []
    for name, link in zip(company, herf):
        if name not in seen:
            seen.add(name)
            hanzi_list.append('[{}]({})'.format(name, link))

    # Sort by the concatenated lazy pinyin of each entry. Using a sort
    # key (instead of building a pinyin->entry dict) fixes a data-loss
    # bug: with the dict approach, two companies whose pinyin collided
    # overwrote each other, dropping one and duplicating the other.
    sorted_hanzi_list = sorted(
        hanzi_list, key=lambda entry: ''.join(lazy_pinyin(entry)))

    sorted_str = sorted_hanzi_list
    with open('markdown/all.md', 'w', encoding='utf-8') as f:
        # fixed: log message previously named the wrong file (all_exclude.md)
        print('sort_save: 保存 sorted_hanzi_list 到 markdown/all.md ...')
        for entry in sorted_hanzi_list:
            f.write(entry + '\n')
        print('sort_save: 保存完毕')

    print('sort_save: 函数用时:{}s'.format(round(time.time()-mystart, 2)))
    time.sleep(2)

# 排除关键词


def exclude():
    """Filter the global ``sorted_str``, dropping every entry that
    contains a banned industry keyword (``bans_key``) or city name
    (``ban_city``), and save the survivors to ``markdown/all_exclude.md``.
    """
    global sorted_str
    mystart = time.time()
    print('exclude: 开始排除关键词')

    # De-duplicate the combined ban list while preserving order
    # (dict.fromkeys is O(n); the old list scan was O(n^2)).
    bans = list(dict.fromkeys(bans_key + ban_city))

    count = 0
    contain = []

    for line in sorted_str:
        # Every ban entry is a plain substring (no regex metacharacters),
        # so a substring test is equivalent to the old re.findall scan —
        # and any() short-circuits on the first hit.
        if any(ban in line for ban in bans):
            print('exclude: 排除公司', line)
            count += 1
        else:
            contain.append(line)

    print('exclude: 已排除公司数:{} 筛选出公司数:{}'.format(count, len(contain)))

    sorted_str = contain
    with open('markdown/all_exclude.md', 'w', encoding='utf-8') as f:
        print('exclude: 保存 contain 到 markdown/all_exclude.md ...')
        for line in contain:
            f.write(line + '\n')
        print('exclude: 保存完毕')

    print('exclude: 函数用时:{}s'.format(round(time.time()-mystart, 2)))
    time.sleep(2)


if __name__ == '__main__':
    started_at = time.time()
    # Scrape each university's job board in turn...
    for scraper in (getinfo_chengxin, getinfo_haida,
                    getinfo_chengdian, getinfo_hangdian):
        scraper()
    # ...then sort/de-duplicate the combined results and apply the
    # keyword/city blocklists.
    sort_save()
    exclude()
    print('main: 时间:{}s'.format(round(time.time() - started_at, 2)))
